### importing required Libraries
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import roc_auc_score
pd.set_option('display.max_columns', 100)
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold
import os
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from itertools import combinations
!pip install catboost
!pip install shap
import shap
from catboost import CatBoostClassifier, Pool
os.chdir('/content/drive/MyDrive/Colab Notebooks/jobathon')
## Reading Train Data
train = pd.read_csv('train_s3TEQDk.csv')
## Reading Test Data
test = pd.read_csv('test_mSzZ8RL.csv')
# DATA TYPECASTING, MISSING-VALUES
# (the heading above was a bare markdown line from the notebook export,
#  which is a SyntaxError in a .py file; it is now a comment)
def check_df(dataframe):
    """Print a quick structural summary of *dataframe*.

    Shows shape, dtypes, head/tail (3 rows each), per-column NA counts and
    selected quantiles of the numeric columns.  Returns None (prints only).
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(3))
    print("##################### Tail #####################")
    print(dataframe.tail(3))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    # Restrict to numeric columns: DataFrame.quantile raises on
    # object/categorical columns in recent pandas versions.
    print(dataframe.select_dtypes('number').quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)
# Quick structural summaries of both datasets.
check_df(train)
check_df(test)

# Credit_Product is missing for just over 10% of rows; keep those rows and
# tag the gap with an explicit 'NA' level instead of dropping them.
for _frame in (train, test):
    _frame['Credit_Product'] = _frame['Credit_Product'].fillna('NA')

# Confirm no missing values remain in either frame.
print(train.isna().sum())
print(test.isna().sum())
## typecasting train and test features
category_cols = ['Gender', 'Region_Code', 'Channel_Code', 'Occupation', 'Credit_Product', 'Is_Active']
numerical_cols = ['Age', 'Vintage', 'Avg_Account_Balance']
target_cols = ['Is_Lead']

# Cast the categorical columns and the numeric columns on both frames.
for _frame in (train, test):
    for _name in category_cols:
        _frame[_name] = _frame[_name].astype('category')
    for _name in numerical_cols:
        _frame[_name] = _frame[_name].astype('float')
# The target column exists only on the training frame.
train[target_cols] = train[target_cols].astype('category')

print(train.dtypes)
print(test.dtypes)
### Age and Vintage Distribution
# Print the min/max of the two tenure-related features for train then test.
for _column in ('Age', 'Vintage'):
    print(train[_column].min(), train[_column].max())
    print(test[_column].min(), test[_column].max())
# Region codes: cardinality in both frames, then per-level counts.
print(train['Region_Code'].nunique(), test['Region_Code'].nunique())
train['Region_Code'].value_counts()
## Collapse every frequent Region_Code level (>= 5000 train rows) to "Top";
## rare levels are returned unchanged.
def make_etc(x):
    """Map a Region_Code level to "Top" when it is frequent in train.

    NOTE(review): the original comments claimed "below 4000 -> 'O'", but the
    code keeps rare levels unchanged and collapses frequent (>= 5000 rows)
    levels to "Top"; the comments were updated to match the code.
    """
    # Count matching rows without materialising a filtered copy of train.
    if (train['Region_Code'] == x).sum() >= 5000:
        return "Top"
    else:
        return x
# Optional rare-level clubbing (left disabled in the final pipeline):
#train['Region_Code'] = train['Region_Code'].apply(make_etc)
#test['Region_Code'] = test['Region_Code'].apply(make_etc)
### Target Value distribution
class_share = train[target_cols].value_counts(normalize=True) * 100
print(class_share)
## No Class imbalance in train data
# CONTINUOUS VARIABLE DISTRIBUTION
# PLOTS TO CHECK TARGET Is_Lead AND OTHER CATEGORIES' DISTRIBUTION
# (the two headings above were bare markdown lines -- SyntaxErrors in a
#  .py file -- and are now comments)
sns.displot(train['Avg_Account_Balance'])
sns.displot(test['Avg_Account_Balance'])
## Account Balance is right skewed in both train and test
# Age Variable Distribution
sns.displot(train['Age'])
sns.displot(test['Age'])
## Distribution of Vintage - converting to years
sns.displot(train['Vintage'] / 12)
sns.displot(test['Vintage'] / 12)
# Gender vs Target Variable CountPlot
# NOTE: column names are passed via keywords; positional Series arguments
# were deprecated and then removed in seaborn >= 0.12.
sns.countplot(data=train, x='Gender', hue='Is_Lead')
## Is_Active vs Target Distribution
sns.countplot(data=train, x='Is_Active', hue='Is_Lead')
## Occupation vs Target
sns.countplot(data=train, x='Occupation', hue='Is_Lead')
## Checking unique Region_Code, Channel_Code
train['Region_Code'].nunique(), train['Channel_Code'].nunique()
sns.countplot(data=train, x='Channel_Code', hue='Is_Lead')
# Min-max scale the continuous features.
# BUG FIX: the original computed (v - min) / max, which is not min-max
# scaling; the denominator must be the range (max - min).  The original
# form still preserved ordering, so tree models were unaffected, but the
# values were not in [0, 1] as intended.
# NOTE(review): train and test are scaled with their OWN statistics here;
# strictly, the train min/max should be reused on test to keep the two
# frames on one scale -- confirm before relying on scaled magnitudes.
for _col in ('Avg_Account_Balance', 'Age', 'Vintage'):
    train[_col] = (train[_col] - train[_col].min()) / (train[_col].max() - train[_col].min())
    test[_col] = (test[_col] - test[_col].min()) / (test[_col].max() - test[_col].min())
print(train['Vintage'].describe())
print(test['Vintage'].describe())
train['Age'].describe()
## Vintage was recorded in months; dividing by 12 here only rescales the
## already-normalised values (a monotone change), it no longer yields years.
test['Vintage'] = test['Vintage'] / 12
train['Vintage'] = train['Vintage'] / 12
# COMBINING TRAIN AND TEST
# LABEL ENCODING CATEGORIES
# (the two headings above were bare markdown lines -- SyntaxErrors in a
#  .py file -- and are now comments)
train['train_or_test'] = 1
test['train_or_test'] = 0
# ignore_index avoids duplicate row labels after stacking the two frames.
df = pd.concat([train, test], ignore_index=True)
category_cols = ['Gender', 'Region_Code', 'Channel_Code', 'Occupation', 'Credit_Product', 'Is_Active']
numerical_cols = ['Age', 'Vintage', 'Avg_Account_Balance']
target_cols = ['Is_Lead']
le = LabelEncoder()
for col in category_cols:
    # Cast to str first so NaN / category dtypes encode consistently.
    df[col] = df[col].astype('str')
    df[col] = le.fit_transform(df[col])
df.shape
# FREQUENCY ENCODING, CREATING INTERACTION FEATURES AND EXTRACTING FEATURES
# FROM AGGREGATE MEASURES (heading was a bare markdown line; now a comment)
def frequency_encoding(column_name, output_column_name, df):
    """Add *output_column_name* to *df* in place.

    Each row receives the relative frequency (share of rows) of its value
    in *column_name*.
    """
    fe_pol = (df.groupby(column_name).size()) / len(df)
    # map() is the vectorised equivalent of the original apply(lambda ...).
    df[output_column_name] = df[column_name].map(fe_pol)
def _merge_group_stats(df, group_keys, agg_spec, prefix):
    """Aggregate *df* by *group_keys* with *agg_spec*, flatten the resulting
    column MultiIndex to '<prefix><col>_<stat>' names (matching the
    original naming scheme) and left-merge the stats back onto *df*."""
    stats = df.groupby(group_keys).agg(agg_spec)
    stats.columns = [prefix + '_'.join(c).strip('_') for c in stats.columns]
    return pd.merge(df, stats, on=group_keys, how='left')

def feature_engineering(df):
    """Create interaction, frequency-encoded and group-aggregate features.

    Returns (augmented df, list of interaction feature names).  Relies on
    the module-level frequency_encoding() helper.
    """
    base_columns = ['Gender', 'Region_Code', 'Channel_Code', 'Occupation', 'Credit_Product', 'Is_Active']

    # Pairwise interactions: concatenate the two codes, then overwrite the
    # new column with its frequency encoding (the string is only transient).
    cat_features = []
    for left, right in combinations(base_columns, 2):
        name = f'{left}_{right}'
        df[name] = df[left].astype(str) + '_' + df[right].astype(str)
        frequency_encoding(name, name, df)
        cat_features.append(name)

    # Frequency Encoding of the high-cardinality single columns.
    frequency_encoding('Region_Code', 'Region_Code_fe', df)
    frequency_encoding('Channel_Code', 'Channel_Code_fe', df)
    frequency_encoding('Occupation', 'Occupation_fe', df)

    # One shared aggregation recipe for the region "profile" features.
    profile_agg = {
        'Age': ['mean', 'max', 'min', 'std', 'sum'],
        'Vintage': ['mean', 'max', 'min', 'std', 'sum'],
        'Avg_Account_Balance': ['mean', 'max', 'min', 'std', 'sum'],
        'Channel_Code': ['nunique', 'count'],
        'Gender': ['nunique', 'count'],
        'Credit_Product': ['nunique', 'count'],
        'Occupation': ['nunique', 'count'],
        'Is_Active': ['nunique', 'count'],
    }
    # Deriving characteristics of each region (and region x channel /
    # region x occupation) via aggregate measures.
    df = _merge_group_stats(df, ['Region_Code'], profile_agg, 'Region_aggregate_features')
    df = _merge_group_stats(df, ['Region_Code', 'Channel_Code'], profile_agg, 'region_channel_aggregate_features')
    # BUG FIX: the original renamed the region x occupation columns by
    # iterating the region x channel frame's ALREADY-RENAMED string columns,
    # so '_'.join(c) joined individual characters and produced garbled
    # names like 'r_e_g_i_o_n_...'; each frame now names its own columns.
    df = _merge_group_stats(df, ['Region_Code', 'Occupation'], profile_agg, 'region_occupation_aggregate_features')

    # Balance statistics grouped by each (frequency-encoded) interaction,
    # in the original merge order so the final column order is unchanged.
    balance_agg = {'Avg_Account_Balance': ['mean', 'max', 'min', 'std']}
    for key, prefix in [
        ('Region_Code_Channel_Code', 'grpd_by_Region_Code_Channel_Code'),
        ('Channel_Code_Occupation', 'grpd_by_Channel_Code_Occupation'),
        ('Occupation_Credit_Product', 'grpd_by_Credit_Occupation'),
        ('Occupation_Is_Active', 'grpd_by_Active_Occupation'),
        ('Credit_Product_Is_Active', 'grpd_by_Active_Credit'),
        ('Region_Code_Is_Active', 'grpd_by_Active_Region'),
        ('Channel_Code_Is_Active', 'grpd_by_Active_Channel'),
        ('Region_Code_Credit_Product', 'grpd_by_Region_Credit'),
        ('Channel_Code_Credit_Product', 'grpd_by_Channel_Credit'),
    ]:
        df = _merge_group_stats(df, [key], balance_agg, prefix)
    return df, cat_features
# Build all engineered features, then collect the full categorical list
# (original label-encoded columns plus the interaction feature names).
df,cat_features=feature_engineering(df)
cat_cols=category_cols+cat_features
cat_cols
# NOTE(review): the interaction columns in cat_features hold frequency
# encodings in (0, 1) (see feature_engineering); casting them to int below
# truncates them all to 0.  Confirm whether the intent was to cast only
# the label-encoded category_cols (e.g. for CatBoost's cat_features)
# before relying on these columns downstream.
for col in cat_cols:
    df[col]=df[col].astype('int')
df[cat_cols].dtypes
df.shape
# TRAIN_TEST SPLIT, DROPPING COLUMNS, TOTAL OF 138 FEATURES
# MODEL BUILDING
# (the two headings above were bare markdown lines -- SyntaxErrors in a
#  .py file -- and are now comments)
train = df.loc[df.train_or_test.isin([1])]
test = df.loc[df.train_or_test.isin([0])]
drop_columns = {'ID', 'Is_Lead', 'train_or_test'}
target = ['Is_Lead']
x = train.drop(columns=drop_columns, axis=1)
y = train[target].astype('int')
x_test = test.drop(columns=drop_columns, axis=1)
print(x.shape)
# MODEL-1
# LGBM WITH STRATIFIED KFOLD
# (the two headings above were bare markdown lines; now comments)
# Tuned LightGBM hyper-parameters as a plain dict.
# FIXES vs the original:
#  * `params= params=([...])` built a list of (name, value) tuples that was
#    then passed as LGBMClassifier(params=...), so LightGBM silently
#    ignored every tuned value; a dict that can be **-expanded is used,
#  * 'Is_unbalance' dropped: LightGBM parameter names are lower-case, and
#    is_unbalance conflicts with scale_pos_weight (only one may be set),
#  * 'category_features' dropped: not a LightGBM parameter -- pass
#    categorical_feature=cat_cols to fit() instead if needed.
params = {
    'colsample_bytree': 0.3706219857878677,
    'learning_rate': 0.018102685623591585,
    'max_bin': 941,
    'max_depth': 2,
    'min_child_samples': 22,
    'min_child_weight': 4,
    'n_estimators': 12041,
    'num_leaves': 17,
    'reg_alpha': 1.081049236893711e-05,
    'reg_lambda': 1.043686239159047,
    'scale_pos_weight': 0.19222548462579486,
    'subsample': 0.60,
    'subsample_for_bin': 375140,
    'subsample_freq': 7,
}
# Out-of-fold LightGBM training: 8-fold stratified CV, averaging the test
# predictions across folds and tracking the per-fold / overall AUC.
err = []
oofs = np.zeros(shape=(len(x)))
preds = np.zeros(shape=(len(x_test)))
Folds = 8
fold = StratifiedKFold(n_splits=Folds, shuffle=True, random_state=2021)
i = 1
for train_index, test_index in fold.split(x, y):
    x_train, x_val = x.iloc[train_index], x.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    # BUG FIX: the original called LGBMClassifier(params=params), which
    # LightGBM treats as one unknown keyword named "params" -- every tuned
    # value was silently ignored.  dict(params) works whether params is a
    # dict or the legacy list of (name, value) tuples.
    m = lgb.LGBMClassifier(**dict(params), verbose=-1)
    # NOTE(review): the early_stopping_rounds / verbose fit() kwargs were
    # removed in lightgbm >= 4; there use
    # callbacks=[lgb.early_stopping(50)] -- confirm the installed version.
    m.fit(x_train, y_train, eval_set=[(x_val, y_val)],
          early_stopping_rounds=50, verbose=False, eval_metric='auc')
    pred_y = m.predict_proba(x_val)[:, 1]
    oofs[test_index] = pred_y
    # Compute the fold AUC once (the original computed it twice).
    fold_auc = roc_auc_score(y_val, pred_y)
    print(i, " err_lgm: ", fold_auc)
    err.append(fold_auc)
    preds += m.predict_proba(x_test)[:, 1]
    i = i + 1
preds = preds / Folds
print(f"Average StratifiedKFold Score : {sum(err)/Folds} ")
oof_score = roc_auc_score(y, oofs)
print(f'\nOOF Auc is : {oof_score}')
oofs = pd.DataFrame(oofs, columns=['lgbmoof'])
preds0 = pd.DataFrame(preds, columns=['lgbmpred'])
# LGBM submission (the heading above this cell was a bare markdown line --
# a SyntaxError in a .py file -- and is now a comment)
submission = pd.read_csv('sample_submission_eyYijxG.csv')
submission['Is_Lead'] = preds0['lgbmpred']
submission.to_csv('lgbm_preds_30May_final.csv', index=False)
# XGBOOST WITH STRATIFIED K FOLD
# MODEL-2
# (the two headings above were bare markdown lines; now comments)
# Tuned XGBoost hyper-parameters as a plain dict.
# FIXES vs the original:
#  * list of (name, value) tuples replaced with a dict so it can be
#    **-expanded into XGBClassifier (the original passed it as an ignored
#    `params=` keyword),
#  * 'Is_unbalance' dropped -- it is a LightGBM flag, not an XGBoost one,
#  * 'category_features' dropped -- not an XGBoost parameter.
params = {
    'colsample_bylevel': 0.3706219857878677,
    'colsample_bytree': 0.6142670193823258,
    'gamma': 0.1331227203252178,
    'objective': 'binary:logistic',
    'tree_method': 'approx',
    'learning_rate': 0.03450570695385555,
    'max_delta_step': 2,
    'max_depth': 5,
    'min_child_weight': 4,
    'n_estimators': 97,
    'reg_alpha': 1.081049236893711e-05,
    'reg_lambda': 1.043686239159047,
    'scale_pos_weight': 0.19222548462579486,
}
# Out-of-fold XGBoost training mirroring the LightGBM loop above.
err = []
oofs = np.zeros(shape=(len(x)))
preds = np.zeros(shape=(len(x_test)))
Folds = 8
fold = StratifiedKFold(n_splits=Folds, shuffle=True, random_state=123)
i = 1
for train_index, test_index in fold.split(x, y):
    x_train, x_val = x.iloc[train_index], x.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    # BUG FIX: XGBClassifier(params=params, verbose=-1) passed the tuned
    # values as one ignored keyword and `verbose` is not a constructor
    # argument; expand the values instead (dict() accepts both a dict and
    # the legacy list of tuples).
    m2 = xgb.XGBClassifier(**dict(params))
    # NOTE(review): early_stopping_rounds moved from fit() to the
    # constructor in xgboost >= 2.0 -- confirm the installed version.
    m2.fit(x_train, y_train, eval_set=[(x_val, y_val)],
           early_stopping_rounds=50, verbose=1000, eval_metric='auc')
    pred_y = m2.predict_proba(x_val)[:, 1]
    oofs[test_index] = pred_y
    # Compute the fold AUC once (the original computed it twice).
    fold_auc = roc_auc_score(y_val, pred_y)
    print(i, " err_xgb: ", fold_auc)
    err.append(fold_auc)
    preds += m2.predict_proba(x_test)[:, 1]
    i = i + 1
preds = preds / Folds
print(f"Average StratifiedKFold Score : {sum(err)/Folds} ")
oof_score = roc_auc_score(y, oofs)
print(f'\nOOF Auc is : {oof_score}')
oofs1 = pd.DataFrame(oofs, columns=['xgboof'])
preds1 = pd.DataFrame(preds, columns=['xgbpred'])
## Build the XGBoost submission: fill the sample file's Is_Lead column
## with the fold-averaged test predictions and write it out.
submission1 = pd.read_csv('sample_submission_eyYijxG.csv')
submission1['Is_Lead'] = preds1['xgbpred']
submission1.shape
submission1.to_csv('xgb_preds_30May_final.csv', index=False)
# MODEL-3: CatBoost with stratified K-fold
# (the heading above was a bare markdown line; now a comment)
err = []
oofs = np.zeros(shape=(len(x)))
preds = np.zeros(shape=(len(x_test)))
Folds = 8
fold = StratifiedKFold(n_splits=Folds, shuffle=True, random_state=2020)
i = 1
for train_index, test_index in fold.split(x, y):
    x_train, x_val = x.iloc[train_index], x.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    # FIX: the constructor used verbose=-1, which is not a valid CatBoost
    # verbosity (it expects a bool or a non-negative period); use False.
    m3 = CatBoostClassifier(iterations=50, learning_rate=0.40013, depth=5,
                            border_count=101, bagging_temperature=0.086008,
                            l2_leaf_reg=6, random_strength=4.4552e-05,
                            scale_pos_weight=0.58518, verbose=False,
                            nan_mode='Min', cat_features=cat_cols)
    m3.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=False)
    pred_y = m3.predict_proba(x_val)[:, 1]
    oofs[test_index] = pred_y
    fold_auc = roc_auc_score(y_val, pred_y)
    # BUG FIX: the original printed "err_lgm" here (copy-paste from the
    # LightGBM loop); label the CatBoost metric correctly.
    print(i, " err_cat: ", fold_auc)
    err.append(fold_auc)
    preds += m3.predict_proba(x_test)[:, 1]
    i = i + 1
preds = preds / Folds
print(f"Average StratifiedKFold Score : {sum(err)/Folds} ")
oof_score = roc_auc_score(y, oofs)
print(f'\nOOF Auc is : {oof_score}')
## Wrap the CatBoost out-of-fold / test predictions in frames and build
## the third submission file.
oofs = pd.DataFrame(oofs, columns=['cbmoof'])
preds3 = pd.DataFrame(preds, columns=['cbmpred'])
submission2 = pd.read_csv('sample_submission_eyYijxG.csv')
submission2['Is_Lead'] = preds3['cbmpred']
submission2.shape
submission2.to_csv('cb_preds_30May_final.csv', index=False)
import pandas as pd
def get_lgbm_varimp(model, train_columns, max_vars=50):
    """Return a DataFrame of the top *max_vars* features by importance.

    Works with both a raw lightgbm.basic.Booster (feature_importance()
    method) and the scikit-learn wrappers (feature_importances_ attribute).
    Columns: 'feature_name', 'varimp', sorted by importance descending.
    """
    is_raw_booster = "basic.Booster" in str(model.__class__)
    importances = model.feature_importance() if is_raw_booster else model.feature_importances_
    varimp = pd.DataFrame([train_columns, importances]).T
    varimp.columns = ['feature_name', 'varimp']
    varimp = varimp.sort_values(by='varimp', ascending=False)
    return varimp.iloc[:max_vars]
# Show the top LightGBM feature importances.
get_lgbm_varimp(m, x.columns, max_vars=25)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def plotImp(model):
    """Plot the top-10 feature importances of *model* and save the figure.

    BUG FIX: the original ignored its `model` argument and always read the
    global LightGBM model `m`; it now uses the argument it is given.
    """
    feature_imp = get_lgbm_varimp(model, x.columns, max_vars=20)
    plt.figure(figsize=(40, 20))
    sns.set(font_scale=5)
    sns.barplot(x="varimp", y="feature_name",
                data=feature_imp.sort_values(by="varimp", ascending=False)[0:10])
    plt.title('LGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgb_importances-01.png')
    plt.show()
plotImp(m)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# NOTE(review): this redefines plotImp() from the LightGBM section with an
# XGBoost title (notebook style); consider giving the two plotting
# functions distinct names.
def plotImp(model):
    """Plot the top-10 feature importances of *model* and save the figure.

    BUG FIX: the original hard-coded the global XGBoost model `m2` instead
    of using its `model` parameter; it now uses the argument it is given.
    """
    feature_imp = get_lgbm_varimp(model, x.columns, max_vars=20)
    plt.figure(figsize=(40, 20))
    sns.set(font_scale=5)
    sns.barplot(x="varimp", y="feature_name",
                data=feature_imp.sort_values(by="varimp", ascending=False)[0:10])
    plt.title('XGB Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('XGB_importances.png')
    plt.show()
get_lgbm_varimp(m2, x.columns, max_vars=25)
plotImp(m2)
## SHAP explanation of the CatBoost model (m3).
shap.initjs()
explainer = shap.TreeExplainer(m3)
# Positional indices of the categorical columns, as required by Pool.
categorical_features_indices = np.where(x_train.columns.isin(cat_cols))[0]
categorical_features_indices
shap_values = explainer.shap_values(
    Pool(x_train, y_train, cat_features=categorical_features_indices))
# Summarize the effects of all the features.
shap.summary_plot(shap_values, x_train)
shap.initjs()
# Visualize the training-set predictions for the first 500 rows.
shap.force_plot(explainer.expected_value, shap_values[0:500, :], x_train.iloc[0:500, :])
# Feature-importance bar plot.
# NOTE(review): color='rgb' is passed through to matplotlib -- confirm it
# is accepted as a colour by the installed shap/matplotlib versions.
shap.summary_plot(shap_values, x_train, plot_type="bar", color='rgb')
shap.initjs()
# Force plot for a single example (row 4).
shap.force_plot(explainer.expected_value, shap_values[4, :], x_train.iloc[4, :])